Introduction

This IPython notebook illustrates how to use multiple blockers and combine the results.

First, we need to import the py_entitymatching package and other libraries as follows:



In [1]:

    
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

Then, read the (sample) input tables for blocking purposes.



In [2]:

    
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join inserts the platform separator, so no manual os.sep
# concatenation is needed.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Build the paths of the two input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')



In [3]:

    
# Load both CSV files (table A first, then table B), declaring 'ID' as the
# key attribute of each table
A, B = (
    em.read_csv_metadata(csv_path, key='ID')
    for csv_path in (path_A, path_B)
)

Combining Multiple Blockers



In [5]:

    
#Blocking plan :

#A, B --overlap blocking--> candset --attr-equiv-block--> candset  |
#                                                                  | 
#A, B ------------rule-based-blocking--------------------> candset |----union--->candset
#                                                                  |
#A, B -----------black-box-blocking----------------------> candset |



In [6]:

    
# Step 1 of the plan: overlap blocking directly over the input tables A and B
ob = em.OverlapBlocker()

# Block on the 'name' attribute of both tables, comparing at word level with
# an overlap size of 1; carry name and birth_year into the candidate set.
C = ob.block_tables(
    A,
    B,
    l_overlap_attr='name',
    r_overlap_attr='name',
    word_level=True,
    overlap_size=1,
    l_output_attrs=['name', 'birth_year'],
    r_output_attrs=['name', 'birth_year'],
    show_progress=False,
)
C









    Out[6]:







  
    
      
      _id
      ltable_ID
      rtable_ID
      ltable_name
      ltable_birth_year
      rtable_name
      rtable_birth_year
    
  
  
    
      0
      0
      a3
      b2
      William Bridge
      1986
      Bill Bridge
      1986
    
    
      1
      1
      a2
      b3
      Michael Franklin
      1988
      Mike Franklin
      1988
    
    
      2
      2
      a5
      b5
      Alphonse Kemper
      1984
      Alfons Kemper
      1984
    
    
      3
      3
      a2
      b6
      Michael Franklin
      1988
      Michael Brodie
      1987



In [7]:

    
# The overlap blocker `ob` and its candidate set `C` were already created in
# the previous cell with exactly these arguments, so the blocking step is not
# repeated here; this cell only displays C again.
C









    Out[7]:







  
    
      
      _id
      ltable_ID
      rtable_ID
      ltable_name
      ltable_birth_year
      rtable_name
      rtable_birth_year
    
  
  
    
      0
      0
      a3
      b2
      William Bridge
      1986
      Bill Bridge
      1986
    
    
      1
      1
      a2
      b3
      Michael Franklin
      1988
      Mike Franklin
      1988
    
    
      2
      2
      a5
      b5
      Alphonse Kemper
      1984
      Alfons Kemper
      1984
    
    
      3
      3
      a2
      b6
      Michael Franklin
      1988
      Michael Brodie
      1987



In [8]:

    
# Step 2 of the plan: attribute-equivalence blocking applied to the candidate
# set C, using birth_year as the blocking attribute on both sides
ab = em.AttrEquivalenceBlocker()
D = ab.block_candset(
    C,
    l_block_attr='birth_year',
    r_block_attr='birth_year',
    show_progress=False,
)



In [9]:

    
# Display D, the candidate set left after attribute-equivalence blocking on C
D









    Out[9]:







  
    
      
      _id
      ltable_ID
      rtable_ID
      ltable_name
      ltable_birth_year
      rtable_name
      rtable_birth_year
    
  
  
    
      0
      0
      a3
      b2
      William Bridge
      1986
      Bill Bridge
      1986
    
    
      1
      1
      a2
      b3
      Michael Franklin
      1988
      Mike Franklin
      1988
    
    
      2
      2
      a5
      b5
      Alphonse Kemper
      1984
      Alfons Kemper
      1984



In [10]:

    
# Rule-based blocking over the input tables.
# Start by generating the features that blocking rules may refer to.
feature_table = em.get_features_for_blocking(
    A,
    B,
    validate_inferred_attr_types=False,
)



In [11]:

    
# Build the rule-based blocker
rb = em.RuleBasedBlocker()

# Register the rule: block a tuple pair when
# name_name_lev_sim(ltuple, rtuple) < 0.4
rb.add_rule(
    ['name_name_lev_sim(ltuple, rtuple) < 0.4'],
    feature_table,
)









    Out[11]:





'_rule_0'



In [12]:

    
# Run the rule-based blocker over A and B, keeping only the name attribute
# of each side in the output
E = rb.block_tables(
    A,
    B,
    l_output_attrs=['name'],
    r_output_attrs=['name'],
    show_progress=False,
)



In [13]:

    
# Black box blocking: create the blocker object here; the blocking function
# is defined and registered in the cells that follow.
bb = em.BlackBoxBlocker()



In [14]:

    
# Define a black box function.
# The blocker function should drop tuple pairs whose last names do not match.
# The function has to do the following steps:
#  1) Get the name attribute from each of the tuples
#  2) Split the name attribute to get the last name
#  3) If the last names do not match, return True (i.e., drop the pair)



In [15]:

    
def my_function(x, y):
    """Black box blocking function: decide whether a tuple pair is dropped.

    Args:
        x: Tuple from the left table (a pandas Series when called by the
            blocker); must expose a string under the key 'name'.
        y: Tuple from the right table, same requirements as ``x``.

    Returns:
        bool: True if the last names differ (the pair should be dropped),
        False if they match (the pair is kept).
    """
    def last_name(full_name):
        # Split on any run of whitespace and take the final token, so that
        # single-word names, middle names and repeated spaces are handled.
        # An empty/blank name yields '' instead of raising IndexError.
        tokens = full_name.split()
        return tokens[-1] if tokens else ''

    # Drop the pair exactly when the last names do not match
    return last_name(x['name']) != last_name(y['name'])



In [16]:

    
# Register my_function as the function the black box blocker applies
bb.set_black_box_function(my_function)



In [17]:

    
# Run the black box blocker over A and B, keeping only the name attribute
# of each side in the output
F = bb.block_tables(
    A,
    B,
    l_output_attrs=['name'],
    r_output_attrs=['name'],
    show_progress=False,
)



In [18]:

    
# Display F, the candidate set produced by the black box blocker
F









    Out[18]:







  
    
      
      _id
      ltable_ID
      rtable_ID
      ltable_name
      rtable_name
    
  
  
    
      0
      0
      a2
      b3
      Michael Franklin
      Mike Franklin
    
    
      1
      1
      a3
      b2
      William Bridge
      Bill Bridge
    
    
      2
      2
      a5
      b5
      Alphonse Kemper
      Alfons Kemper



In [19]:

    
# Combine all the blocker outputs: union the candidate sets D (overlap +
# attribute equivalence), E (rule-based) and F (black box) into G
G = em.combine_blocker_outputs_via_union([D, E, F])



In [20]:

    
# Display G, the union of the three blockers' candidate sets
G









    Out[20]:







  
    
      
      _id
      ltable_ID
      rtable_ID
      ltable_name
      ltable_birth_year
      rtable_name
      rtable_birth_year
    
  
  
    
      0
      0
      a2
      b3
      Michael Franklin
      1988
      Mike Franklin
      1988
    
    
      1
      1
      a2
      b6
      Michael Franklin
      1988
      Michael Brodie
      1987
    
    
      2
      2
      a3
      b2
      William Bridge
      1986
      Bill Bridge
      1986
    
    
      3
      3
      a3
      b6
      William Bridge
      1986
      Michael Brodie
      1987
    
    
      4
      4
      a4
      b2
      Binto George
      1987
      Bill Bridge
      1986
    
    
      5
      5
      a5
      b5
      Alphonse Kemper
      1984
      Alfons Kemper
      1984



In [21]:

    
# Print the metadata properties attached to G (its key, the foreign keys
# into the left/right tables, and the ids of the linked table objects)
em.show_properties(G)









    



id: 4547208976
rtable(obj.id): 4546836464
key: _id
fk_rtable: rtable_ID
fk_ltable: ltable_ID
ltable(obj.id): 4546835680



In [ ]:

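# Pasted copy of the overlap-blocking candidate set C (same rows as shown
# earlier), kept as comments so this cell remains valid Python:
#
# 	_id	ltable_ID	rtable_ID	ltable_name	ltable_birth_year	rtable_name	rtable_birth_year
# 0	0	a3	b2	William Bridge	1986	Bill Bridge	1986
# 1	1	a2	b3	Michael Franklin	1988	Mike Franklin	1988
# 2	2	a5	b5	Alphonse Kemper	1984	Alfons Kemper	1984
# 3	3	a2	b6	Michael Franklin	1988	Michael Brodie	1987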